---
title: "Supplementary Materials for Name Popularity in the Gospels & Acts JSNT Paper"
author: "Jason Wilson"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: yes
    toc_depth: 2
    number_sections: yes
---

```{r setup, include=FALSE}
# This is the version for the resubmission. Updates:
# - Updated names update (Ilan.rev)
# - Four additional P values calculated for Luuk
# - Added comments on the probability test (binomial test) of GB requested by the reviewer

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = FALSE)

load("~/Home Jason/5.0 Teaching/Statistics/R Files/Projects/VanDeWeghe_Gospel_Historicity/Gospels-Acts-Ilan2.RData") #Note: I updated this file with the revised Ilan data

```

```{r}
library(dplyr)
library(gridExtra)
library(tidyr)
library(ggformula)

#Global resampling variables
NREP = 10^4 #This is the number of replicates for all Uniform permutation tests.
            #It should only be set to 10^4
            #for the final run.  It dramatically increases the run-time.

GA.size = 52 #This is the sample size of our uniform simulation.  I set it as a global variable because otherwise I'd need to recalculate it inside of simulations for the CI calculations and I wanted to be able to change it, if necessary.

```
This file is the Supplementary Materials for paper, "Re-Examining Gregor and Blais’ Argument from Name Popularity in Light of Proper Statistical Methodology" by Jason Wilson and Luuk Van de Weghe, submitted to the **Journal for the Study of the New Testament**.  It contains the code used to generate the figures and tables in the paper, as well as the statistical tests used to check the results.


# Graph Name Frequencies
### Setting up the data
This is a very dirty calculation, but I want to get done and work on the writing, and this does the job, so I apologize for the messy code.
```{r}
# Grab some numbers
out = chi.square(Ilan.rev, GA3, breaks = c(1, 5, 22, 100, 150), remove=TRUE) 
#out2 = chi.square(Ilan.rev, GB3, breaks = c(1, 5, 22, 100, 150), remove=TRUE) 
ilan.rare.num = out[2][[1]]$Reference[1] # # rare names in Ilan1
GA.rare.num = out[2][[1]]$Target[1] # # rare names in GA


# Set up  the data.frame (very, very dirty alculations)
f.I =table(Ilan.rev$name)
ind = order(f.I, decreasing=TRUE)
f.I2 = data.frame(f.I[ind[1:12]])
names(f.I2) <- c('Name', 'Ilan1')

f.J =table(Josephus$name)
ind = order(f.J, decreasing=TRUE)
f.J2 = data.frame(f.J)
names(f.J2) <- c('Name', 'Josephus')

data1 = left_join(f.I2, GA[,1:2], by='Name')
data2 = left_join(data1, f.J2, by='Name')

rare = c('Rare', ilan.rare.num, GA.rare.num, 33)
data3 = rbind(rare, data2)
colnames(data3)[3] <- 'Gospels+Acts'
data3[12,3] = 0
#data3[,2:4] <- as.numeric(data3[,2:4])

data4 = data.frame(data3[,1], 
                   100*as.numeric(data3[,2])/2114, 
                   100*as.numeric(data3[,3])/82, 
                   100*as.numeric(data3[,4])/274)
names(data4) <- c('Name', 'Ilan1', 'Gospels+Acts', 'Josephus')

#Prep for plotting
data6 = data4 %>%
  pivot_longer(
    cols = !Name,
    names_to = 'Text',
    values_to = 'Proportion'
  )
data6$Name <- factor(data6$Name, levels = data4$Name[1:13])
data6$PlotKey = rep(1:13, each=3)
data7 = data6[-seq(3,39,by=3),]

```

Here is the plotting code:
```{r}
#tiff(file = "Barplot_Names_Bauckham.tiff", width = 1100, height = 550,res = 144, units = "px", pointsize = 12, compression = "lzw", bg = "white") 
filter(data7, Text=='Ilan1') %>% 
  gf_col(Proportion ~ Name) %>%
  gf_line(Proportion ~ PlotKey, color = ~Text, linewidth=1.25,
          data = filter(data7, Text=='Gospels+Acts' | Text=='Josephus')) %>%
  gf_point(Proportion ~ PlotKey, color = ~Text, size=2.25,
          data = filter(data7, Text=='Gospels+Acts' | Text=='Josephus')) %>%
  gf_theme(theme_bw()) %>%
  gf_refine(scale_color_manual(values="yellow2")) %>%
  gf_theme(legend.position="none") %>%
  gf_labs(x = "Names", y = "% of Occurrences of Name") +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 13))
#dev.off()
```

```{r}
tiff(file = "Double_Lineplot_GB.tiff", width = 1100, height = 550,res = 144, units = "px", pointsize = 12, compression = "lzw", bg = "white")
data8 = filter(data6, Text=='Ilan1')
lm1 = lm(Proportion ~ PlotKey + I(PlotKey^2), data = data8)
x = seq(1,13,0.1)
y = predict(lm1, newdata = data.frame(PlotKey = x))
plot(x,y, ylim=c(0,13), type='l', col='green4', lwd=2, xlab='Names', ylab='% of Occurrences of Name')
grid()
points(x,rep(1,times=length(x)), type='l', col='red3', lwd=2, lty=2)
legend('topright', legend=c('Historical', 'Anonymous Community Transmission'), col=c('green4', 'red3'), lty=c(1,2), lwd=2)
dev.off()
```

# Binomial Calculations
In Gregor & Blais (2023) they formally used a Monte Carlo method for this part.  However, from what I coudl tell, they assumed independence in their calculations, which therefore meet the assumption needed for the binomial distribution.  The checking of binomial results with theirs were similar, therefore, I opted for the binomial distribution as an easier and clearer way to approximate their results.  Here they are:
```{r}
1-pbinom(7, size=82, prob=184/2186) #Unbiased
1-pbinom(7, size=53, prob=184/2186) #GB
1-pbinom(7, size=33, prob=184/2186) #hypothetical 20 less contested

1-pbinom(5, size=53, prob=0.0782) #Joseph
pbinom(1, size=53, prob=0.0618) #Eleazar
1-pbinom(2, size=53, prob=0.0576) #Judas
```
If anyone reads the above and wonders about "1-" vs. no "1-" for Eleazar, it's because GB used one-tailed p-values, even though their alternative hypothesis was two-tailed.  If we used two-tailed p-values, then the p-values for all of the above would double, further weakening their case.

# p-values for Luuk's parts
## The function
```{r}
chi.square <- function(Ilan, Target, breaks = c(1, 4, 20, 100, 150), remove = c(FALSE, TRUE, 'partial'), MAD.RMS = FALSE)
{  #CHI.SQUARE is the master function which takes the data in case format and runs both a chi-square goodness-of-fit test of Target with the Reference, as well as removes the Target from the Reference and performs a chi-square test of independence between the two.

#Label the frequency groups
Ilan.name.freq = data.frame(table(Ilan$name), Category=NA)
colnames(Ilan.name.freq) <- c('Name', 'Freq', 'Category')
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq<=breaks[1], 'F1', Ilan.name.freq$Category)
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq>breaks[1] & Ilan.name.freq$Freq<=breaks[2], 'F2', Ilan.name.freq$Category)
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq>breaks[2] & Ilan.name.freq$Freq<=breaks[3], 'F3', Ilan.name.freq$Category)
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq>breaks[3] & Ilan.name.freq$Freq<=breaks[4], 'F4', Ilan.name.freq$Category)
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq>breaks[4] & Ilan.name.freq$Freq<=breaks[5], 'F5', Ilan.name.freq$Category)
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq>breaks[5], 'F6', Ilan.name.freq$Category)

#Obtain Reference frequencies
f.Ilan = Ilan.name.freq %>% 
  group_by(Category) %>%
  summarize(Frequency = sum(Freq))

#Pull the name lists
names.F1 = filter(Ilan.name.freq, Category=='F1')$Name
names.F2 = filter(Ilan.name.freq, Category=='F2')$Name
names.F3 = filter(Ilan.name.freq, Category=='F3')$Name
names.F4 = filter(Ilan.name.freq, Category=='F4')$Name
names.F5 = filter(Ilan.name.freq, Category=='F5')$Name
names.F6 = filter(Ilan.name.freq, Category=='F6')$Name

#Cross-check against target
Tar.name.freq = data.frame(table(Target$name), Category=NA)
colnames(Tar.name.freq) <- c('Name', 'Freq', 'Category')

Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F1)), Tar.name.freq$Category, 'F1')
Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F2)), Tar.name.freq$Category, 'F2')
Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F3)), Tar.name.freq$Category, 'F3')
Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F4)), Tar.name.freq$Category, 'F4')
Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F5)), Tar.name.freq$Category, 'F5')
Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F6)), Tar.name.freq$Category, 'F6')
Tar.name.freq$Category = ifelse(is.na(Tar.name.freq$Category), 'F1', Tar.name.freq$Category) #Any missing names go 'rare'=F1

#Obtain Target frequencies
f.Tar = Tar.name.freq %>% 
  group_by(Category) %>%
  summarize(Frequency = sum(Freq))

#Ad hoc addition needed for the bootstrap test when small samples occur
if(nrow(f.Tar)<6)
{
  f.Tar2 = data.frame(Category = c('F1', 'F2', 'F3', 'F4', 'F5', 'F6'),
                      Frequency = rep(0, times=6))
  data = full_join(f.Tar, f.Tar2, by='Category', relationship='many-to-one')
  data$Category <- factor(data$Category, levels = f.Tar2$Category, ordered=TRUE)
  ind = order(data$Category)
  f.Tar2$Frequency = ifelse(is.na(data[ind,]$Frequency.x), 0, data[ind,]$Frequency.x)
  f.Tar = f.Tar2
}

###Goodness-of-fit test (2 cases: no remove & remove)
# For printing
B = breaks
breaks2 = c(paste('<=',B[1]), 
            paste(B[1]+1,'-',B[2]), 
            paste(B[2]+1,'-',B[3]), 
            paste(B[3]+1,'-',B[4]), 
            paste(B[4]+1,'-',B[5]), 
            paste('>',B[5]))

# Case 1: No remove
if(remove == FALSE)
{
  out1 = chisq.test(f.Tar$Frequency, p = f.Ilan$Frequency/sum(f.Ilan$Frequency), correct = FALSE)
  #print(out1)
  
  out2 = data.frame(Frequency = breaks2, Reference = f.Ilan$Frequency, Target = f.Tar$Frequency, Expected = out1$expected, ChiSq = (out1$observed - out1$expected)^2 / out1$expected)
  #print(out2)
  
  if(MAD.RMS == TRUE)
  {
    out2b = c(MAD = sum(abs(f.Ilan$Frequency[1:6] - rep(sum(f.Ilan$Frequency[1:6])/6, times=6)))/6 ,
              RMS = sqrt(sum((f.Ilan$Frequency[1:6] - rep(sum(f.Ilan$Frequency[1:6])/6, times=6))^2)) )
    print(paste('The MAD is', out2b[1], 'The RMS is', out2b[2]) )
  }#end if
  
}#end if

# Case 2: Remove
if(remove == TRUE)
{
  #Removal
  Tar.name.freq$Freq = -Tar.name.freq$Freq #Set to negative to use "sum" function
  Combined.name.freq = full_join(Ilan.name.freq, Tar.name.freq, by='Name')
  Freq.reduced = apply(Combined.name.freq[,c(2,4)], MARGIN = 1, sum, na.rm=TRUE)
  Combined.name.freq2 = data.frame(Combined.name.freq, Freq.reduced)
  
  # Re-run frequency categories for Ilan
  f.Ilan2 = Combined.name.freq2 %>% 
    group_by(Category.x) %>%
    summarize(Frequency = sum(Freq.reduced))
  
  #Goodness-of-fit
  out1 = chisq.test(f.Tar$Frequency, p = f.Ilan2$Frequency[1:6]/sum(f.Ilan2$Frequency[1:6]), correct = FALSE)
  #print(out1)
  
  out2 = data.frame(Frequency = breaks2, Reference = f.Ilan2$Frequency[1:6], Target = f.Tar$Frequency, Expected = out1$expected, ChiSq = (out1$observed - out1$expected)^2 / out1$expected)
  #print(out2)
  
  if(MAD.RMS == TRUE)
  {
    out2b = c(MAD = sum(abs(f.Ilan$Frequency[1:6] - rep(sum(f.Ilan$Frequency[1:6])/6, times=6)))/6 ,
              RMS = sqrt(sum((f.Ilan$Frequency[1:6] - rep(sum(f.Ilan$Frequency[1:6])/6, times=6))^2)) )
    print(paste('The MAD is', out2b[1], 'The RMS is', out2b[2]) )
  }#end if
  
 }#end if

# Case 3: Partial Remove
# This was added to deal with partial occurrence of historical characters in Ben Hur and other such documents.

if(remove == 'partial')
{
  #Cross-check against target
  Tar.name.freq.p = data.frame(table(Target$In.Ilan1), Category=NA)
  colnames(Tar.name.freq.p) <- c('Name', 'Freq', 'Category')
  
  Tar.name.freq.p$Category = ifelse(is.na(match(Tar.name.freq.p$Name, names.F1)), Tar.name.freq.p$Category, 'F1')
  Tar.name.freq.p$Category = ifelse(is.na(match(Tar.name.freq.p$Name, names.F2)), Tar.name.freq.p$Category, 'F2')
  Tar.name.freq.p$Category = ifelse(is.na(match(Tar.name.freq.p$Name, names.F3)), Tar.name.freq.p$Category, 'F3')
  Tar.name.freq.p$Category = ifelse(is.na(match(Tar.name.freq.p$Name, names.F4)), Tar.name.freq.p$Category, 'F4')
  Tar.name.freq.p$Category = ifelse(is.na(match(Tar.name.freq.p$Name, names.F5)), Tar.name.freq.p$Category, 'F5')
  Tar.name.freq.p$Category = ifelse(is.na(match(Tar.name.freq.p$Name, names.F6)), Tar.name.freq.p$Category, 'F6')
  Tar.name.freq.p$Category = ifelse(is.na(Tar.name.freq.p$Category), 'F1', Tar.name.freq.p$Category) #Any missing names go 'rare'=F1
  
  #Obtain Target frequencies
  f.Tar.p = Tar.name.freq.p %>% 
    group_by(Category) %>%
    summarize(Frequency = sum(Freq))
  
  #Removal
  Tar.name.freq.p$Freq = -Tar.name.freq.p$Freq #Set to negative to use "sum" function
  Combined.name.freq = full_join(Ilan.name.freq, Tar.name.freq.p, by='Name')
  Freq.reduced = apply(Combined.name.freq[,c(2,4)], MARGIN = 1, sum, na.rm=TRUE)
  Combined.name.freq2 = data.frame(Combined.name.freq, Freq.reduced)
  
  # Re-run frequency categories for Ilan
  f.Ilan2 = Combined.name.freq2 %>% 
    group_by(Category.x) %>%
    summarize(Frequency = sum(Freq.reduced))
  
  #Goodness-of-fit
  out1 = chisq.test(f.Tar$Frequency, p = f.Ilan2$Frequency[1:6]/sum(f.Ilan2$Frequency[1:6]), correct = FALSE)
  #print(out1)
  
  out2 = data.frame(Frequency = breaks2, Reference = f.Ilan2$Frequency[1:6], Target = f.Tar$Frequency, Expected = out1$expected, ChiSq = (out1$observed - out1$expected)^2 / out1$expected)
  #print(out2)
  
  if(MAD.RMS == TRUE)
  {
    out2b = c(MAD = sum(abs(f.Ilan$Frequency[1:6] - rep(sum(f.Ilan$Frequency[1:6])/6, times=6)))/6 ,
              RMS = sqrt(sum((f.Ilan$Frequency[1:6] - rep(sum(f.Ilan$Frequency[1:6])/6, times=6))^2)) )
    print(paste('The MAD is', out2b[1], 'The RMS is', out2b[2]) )
  }#end if
  
 }#end if
  

###Test of independence
# Case 1: No removal (direct comparison)
if(remove == FALSE)
{
  out3 = chisq.test(data.frame(f.Tar$Frequency, f.Ilan$Frequency), correct = FALSE)
  out4 = data.frame(Frequency = breaks2, Target = f.Tar$Frequency, Reference = f.Ilan$Frequency, Expected = out3$expected, ChiSq = (out3$observed - out3$expected)^2 / out3$expected)

  #print(out3)
  return(list(out1, out2, out3, out4))
}#end if

# Case 2: Removal (Remove Target from Reference)
if(remove == TRUE)
{
  out3 = chisq.test(data.frame(f.Tar$Frequency, f.Ilan2$Frequency[1:6]), correct = FALSE)
  out4 = data.frame(Frequency = breaks2, Target = f.Tar$Frequency, Reference = f.Ilan2$Frequency[1:6], Expected = out3$expected, ChiSq = (out3$observed - out3$expected)^2 / out3$expected)
  #print(out3)
  return(list(out1, out2, out3, out4))
}#end if

# Case 3: Partial Removal (Remove Partial Target from Reference)
if(remove == 'partial')
{
  out3 = chisq.test(data.frame(f.Tar$Frequency, f.Ilan2$Frequency[1:6]), correct = FALSE)
  out4 = data.frame(Frequency = breaks2, Target = f.Tar$Frequency, Reference = f.Ilan2$Frequency[1:6], Expected = out3$expected, ChiSq = (out3$observed - out3$expected)^2 / out3$expected)
  #print(out3)
  return(list(out1, out2, out3, out4))
}#end if

}#end function
```

## Apocrypha 1
```{r}
# Load main data
library(readr)
Apocrypha <- read_csv("GBs_Apocryphal_Corpus.csv")

out = chi.square(Ilan.rev, Apocrypha, breaks = c(1, 5, 22, 100, 150), remove=TRUE) #p-value = 4.173e-05
out[2]
```

```{r}
name <- c(
  "Aeneas", "Benjamin", "Eleazar", "Eleazar", "Elisha", "Hagaba",
  "Hananiah", "Joseph", "Micha", "Reuben", "Zachariah", "Zachariah", "Zephaniah"
)

clementine = data.frame(1:13,name)

chi.square(Ilan.rev, clementine, breaks=c(1, 5, 22, 100, 150), remove=FALSE) #p-value = 0.4139
```


## Apocrypha 2 (Remove Adi)
```{r}
Apocrypha2 = Apocrypha
Apocrypha2[30, 2] = 'Eleazar'
Apocrypha2 = Apocrypha2[Apocrypha2$name != 'Adi',]

out2 = chi.square(Ilan.rev, Apocrypha2, breaks = c(1, 5, 22, 100, 150), remove=TRUE) #p-value = 0.0004047
out2[2]


```

## Babylonian Talmud
```{r}
# Load main data
BabTalmud <- read_csv("Bab_Talmud_Males.csv")

out = chi.square(Ilan.rev, BabTalmud, breaks = c(1, 5, 22, 100, 150), remove=TRUE) #p-value = 0.3161
out
out[2]


```

## Historical Novels vs. Ilan-1
```{r}
Pop.HistNov.Ilan1 = chi.square(Ilan.rev, HistNov, breaks = c(1, 5, 22, 100, 150), remove=FALSE)
Pop.HistNov.Ilan1
```

Below is The Spear alone:
```{r}
#The Spear
#chi.square(Ilan, HistNov[33:98,], breaks=c(1, 5, 21, 70, 150), remove='partial')
```

Below is Ben Hur alone.  Six bins works.
```{r}
name <- c(
  "Herod", "Joseph", "Samuel", "Judah", "Yaqim", "Hanan", "Hillel", "Simon",
  "Antipatrus", "Philip", "Archelaus", "Ishmael", "Phiabi", "Yoezer", "Seth",
  "Judah", "Samuel", "Abtalion", "Gamaliel", "Simon", "Ithamar", "Hur", "Malka",
  "Abimelech", "Qaifa", "Yohanan", "Zachariah", "Herod", "Eleazar", "Judah", "Joshua"
)
ben.hur = data.frame(1:31,name)

chi.square(Ilan.rev, ben.hur, breaks=c(1, 5, 22, 100, 150), remove=FALSE)
```


## Check Ossuary/Literary/Other
In order to check the ossuaries, we need to split the Ilan.rev data into three parts: literary names, ossuary names, and other inscriptions/papyri.  We then run chi-square tests on each of these separately.

In order to do that, we have to revise the breaks.  As such, we'll use the same protocol as above - namely roughly even bin frequencies for Ilan.  Because the sample sizes and distributions for ossuary, literary, and other are different, we need to use different been frequencies for each. This becomes apparent as one begins to explore the data.

The thing to watch out for is that throughout the Gospels-Acts match virtually all bins - except for ossuary rare names. This is the standout mismatch that appears to indicate a difference between both Gospels and acts versus ossuaries, as well as potentially ossuaries vs. other distributions.  Further research is warranted on this.

### Bin Checking Function
The purpose of the bin-checking function is to permit the user to provide a set of bins and get the MAD and RMS values as output.  They can then be used to determine how good the bins are.  The function is not written for general-purpose use, but for the specific purpose of this project.
```{r}
bin.check <- function(Ilan, Target, breaks = c(1, 4, 20, 100, 150), remove = FALSE)
{

#Label the frequency groups
Ilan.name.freq = data.frame(table(Ilan$name), Category=NA)
colnames(Ilan.name.freq) <- c('Name', 'Freq', 'Category')
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq<=breaks[1], 'F1', Ilan.name.freq$Category)
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq>breaks[1] & Ilan.name.freq$Freq<=breaks[2], 'F2', Ilan.name.freq$Category)
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq>breaks[2] & Ilan.name.freq$Freq<=breaks[3], 'F3', Ilan.name.freq$Category)
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq>breaks[3] & Ilan.name.freq$Freq<=breaks[4], 'F4', Ilan.name.freq$Category)
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq>breaks[4] & Ilan.name.freq$Freq<=breaks[5], 'F5', Ilan.name.freq$Category)
Ilan.name.freq$Category = ifelse(Ilan.name.freq$Freq>breaks[5], 'F6', Ilan.name.freq$Category)

#Obtain Reference frequencies
f.Ilan = Ilan.name.freq %>% 
  group_by(Category) %>%
  summarize(Frequency = sum(Freq))

#Pull the name lists
names.F1 = filter(Ilan.name.freq, Category=='F1')$Name
names.F2 = filter(Ilan.name.freq, Category=='F2')$Name
names.F3 = filter(Ilan.name.freq, Category=='F3')$Name
names.F4 = filter(Ilan.name.freq, Category=='F4')$Name
names.F5 = filter(Ilan.name.freq, Category=='F5')$Name
names.F6 = filter(Ilan.name.freq, Category=='F6')$Name

#Cross-check against target
Tar.name.freq = data.frame(table(Target$name), Category=NA)
colnames(Tar.name.freq) <- c('Name', 'Freq', 'Category')

Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F1)), Tar.name.freq$Category, 'F1')
Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F2)), Tar.name.freq$Category, 'F2')
Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F3)), Tar.name.freq$Category, 'F3')
Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F4)), Tar.name.freq$Category, 'F4')
Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F5)), Tar.name.freq$Category, 'F5')
Tar.name.freq$Category = ifelse(is.na(match(Tar.name.freq$Name, names.F6)), Tar.name.freq$Category, 'F6')
Tar.name.freq$Category = ifelse(is.na(Tar.name.freq$Category), 'F1', Tar.name.freq$Category) #Any missing names go 'rare'=F1

#Obtain Target frequencies
f.Tar = Tar.name.freq %>% 
  group_by(Category) %>%
  summarize(Frequency = sum(Freq))
  
###Goodness-of-fit test (2 cases: no remove & remove)
# For printing
B = breaks
breaks2 = c(paste('<=',B[1]), 
            paste(B[1]+1,'-',B[2]), 
            paste(B[2]+1,'-',B[3]), 
            paste(B[3]+1,'-',B[4]), 
            paste(B[4]+1,'-',B[5]), 
            paste('>',B[5]))

# Case 1: No remove
if(remove == FALSE)
{
  out1 = chisq.test(f.Tar$Frequency, p = f.Ilan$Frequency/sum(f.Ilan$Frequency), correct = FALSE)
  #print(out1)
  
  out2 = data.frame(Frequency = breaks2, Reference = f.Ilan$Frequency, Target = f.Tar$Frequency, Expected = out1$expected, ChiSq = (out1$observed - out1$expected)^2 / out1$expected)
  #print(out2)
  
  out2b = c(MAD = sum(abs(f.Ilan$Frequency[1:6] - rep(sum(f.Ilan$Frequency[1:6])/6, times=6)))/6 ,
            RMS = sqrt(sum((f.Ilan$Frequency[1:6] - rep(sum(f.Ilan$Frequency[1:6])/6, times=6))^2)) )

}#end if

# Case 2: Remove
if(remove == TRUE)
{
  #Removal
  Tar.name.freq$Freq = -Tar.name.freq$Freq #Set to negative to use "sum" function
  Combined.name.freq = full_join(Ilan.name.freq, Tar.name.freq, by='Name')
  Freq.reduced = apply(Combined.name.freq[,c(2,4)], MARGIN = 1, sum, na.rm=TRUE)
  Combined.name.freq2 = data.frame(Combined.name.freq, Freq.reduced)
  
  # Re-run frequency categories
  f.Ilan2 = Combined.name.freq2 %>% 
    group_by(Category.x) %>%
    summarize(Frequency = sum(Freq.reduced))
 
  #Goodness-of-fit
  out1 = chisq.test(f.Tar$Frequency, p = f.Ilan2$Frequency[1:6]/sum(f.Ilan2$Frequency[1:6]), correct = FALSE)
  #print(out1)
  
  out2 = data.frame(Frequency = breaks2, Reference = f.Ilan2$Frequency[1:6], Target = f.Tar$Frequency, Expected = out1$expected, ChiSq = (out1$observed - out1$expected)^2 / out1$expected)
  #print(out2)
  
  out2b = c(MAD = sum(abs(f.Ilan2$Frequency[1:6] - rep(sum(f.Ilan2$Frequency[1:6])/6, times=6)))/6 ,
            RMS = sqrt(sum((f.Ilan2$Frequency[1:6] - rep(sum(f.Ilan2$Frequency[1:6])/6, times=6))^2)) )
}#end if
  
return( list(breaks, round(out2b, 1)) )


}#end function
```

### Check bins Ossuary
```{r}
#Load Data
Ilan.literary = Ilan.rev[1:756,] #Literary names only
Ilan.ossuary = Ilan.rev[906:1516,] #Ossuary names only
Ilan.other = Ilan.rev[c(757:905, 1517:2114),] #other inscriptions+papyri

#Check
sort(table(Ilan.ossuary$name)) #Max is 64 Simons, 55 Josephs, etc.
length(table(Ilan.ossuary$name)) #173
length(table(Ilan.rev$name)) #406
length(table(Ilan$name)) #457

bin.check(Ilan.ossuary, GA3, breaks = c(1, 2, 15, 25, 50), remove=FALSE) 
bin.check(Ilan.ossuary, GA3, breaks = c(1, 3, 15, 25, 50), remove=FALSE) 
bin.check(Ilan.ossuary, GA3, breaks = c(1, 4, 15, 25, 50), remove=FALSE) #MAD 13.6, RMS 36.5
bin.check(Ilan.ossuary, GA3, breaks = c(1, 5, 15, 25, 50), remove=FALSE) 
bin.check(Ilan.ossuary, GA3, breaks = c(1, 6, 15, 25, 50), remove=FALSE) 

bin.check(Ilan.ossuary, GA3, breaks = c(1, 5, 21, 40, 60), remove=FALSE) #MAD 12.6, RMS 44.4
bin.check(Ilan.ossuary, GA3, breaks = c(1, 4, 11, 24, 50), remove=FALSE) #MAD 11.6, RMS 37.4
```

### Check bins Literary
```{r}
sort(table(Ilan.literary$name)) #Max is 58 Simons, 54 Josephs, etc.
length(table(Ilan.literary$name)) #201
length(table(Ilan.rev$name)) #406
length(table(Ilan$name)) #457

bin.check(Ilan.literary, GA3, breaks = c(1, 7, 15, 31, 54), remove=FALSE) 
bin.check(Ilan.literary, GA3, breaks = c(1, 7, 15, 31, 53), remove=FALSE) #MAD 29.3, RMS 96.4
bin.check(Ilan.literary, GA3, breaks = c(2, 6, 15, 31, 53), remove=FALSE) #MAD 22.7, RMS 73.9
bin.check(Ilan.literary, GA3, breaks = c(1, 4, 15, 25, 53), remove=FALSE) #MAD 26.3, RMS 72.6
bin.check(Ilan.literary, GA3, breaks = c(1, 3, 16, 31, 53), remove=FALSE) 

bin.check(Ilan.literary, GA3, breaks = c(1, 3, 10, 31, 53), remove=FALSE) #MAD 15.3, RMS 40.0
bin.check(Ilan.literary, GA3, breaks = c(1, 4, 11, 31, 53), remove=FALSE) #MAD 12.3, RMS 32.4

#chi.square(Ilan.literary, GA3, breaks = c(1, 4, 11, 31, 53), remove=FALSE) #p-value = 0.4814
```

### Check bins Other
```{r}
sort(table(Ilan.other$name)) #Max is 67 Simons, 65 Josephs, etc.
length(table(Ilan.other$name)) #215
length(table(Ilan.rev$name)) #406
length(table(Ilan$name)) #457

bin.check(Ilan.other, GA3, breaks = c(1, 6, 12, 29, 50), remove=FALSE) #MAD 17.2, RMS 46.9
bin.check(Ilan.other, GA3, breaks = c(1, 5, 15, 31, 53), remove=FALSE) #MAD 14.2, RMS 48.6
bin.check(Ilan.other, GA3, breaks = c(1, 5, 12, 29, 50), remove=FALSE) #MAD  9.2, RMS 26.6

chi.square(Ilan.other, GA3, breaks = c(1, 5, 12, 29, 50), remove=FALSE) #p-value = 0.0571
```


### Tests
```{r}
GB3 = GB[GB$gospels_acts_unattested==1,]
out = chi.square(Ilan.literary, GB3, breaks = c(1, 4, 11, 31, 53), remove=TRUE) #p-value = 0.2701
out

out = chi.square(Ilan.ossuary, GB3, breaks = c(1, 4, 15, 25, 50), remove=FALSE) #p-value = 0.3093
out

out = chi.square(Ilan.other, GB3, breaks = c(1, 5, 12, 29, 50), remove=FALSE) #p-value = 0.5753
out
```
### Chi-square tests for Ilan.rev vs. Literary, Other, and Ossuary
Taking a further look at the three different groups
```{r}
out1 = chi.square(Ilan.rev, Ilan.literary, breaks = c(1, 5, 22, 100, 150), remove=TRUE) 
out2 = chi.square(Ilan.rev, Ilan.other, breaks = c(1, 5, 22, 100, 150), remove=TRUE) 
out3 = chi.square(Ilan.rev, Ilan.ossuary, breaks = c(1, 5, 22, 100, 150), remove=TRUE) 

r1 = out1[2][[1]]$Target
r2 = out2[2][[1]]$Target
r3 = out3[2][[1]]$Target
data = cbind(r1,r2,r3)
rownames(data) = out1[2][[1]]$Frequency
colnames(data) = c("Literary", "Other", "Ossuary")
data
round(prop.table(data, margin=2),2)
chisq.test(data, correct = FALSE)
```




